fastq filesThis step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.
suppressMessages(
c(library(DESeq2),
library(limma),
library(tximport),
library(gridExtra),
library(ensembldb),
library(EnsDb.Mmusculus.v79),
library(grid),
library(ggplot2),
library(lattice),
library(reshape),
library(mixOmics),
library(gplots),
library(RColorBrewer),
library(readr),
library(dplyr),
library(VennDiagram),
library(clusterProfiler),
library(DOSE),
library(org.Mm.eg.db),
library(pathview),
library(AnnotationDbi),
library(knitr))
)
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "BiocParallel" "matrixStats" "Biobase"
## [7] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [10] "S4Vectors" "BiocGenerics" "parallel"
## [13] "stats4" "stats" "graphics"
## [16] "grDevices" "utils" "datasets"
## [19] "methods" "base" "limma"
## [22] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [25] "BiocParallel" "matrixStats" "Biobase"
## [28] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [31] "S4Vectors" "BiocGenerics" "parallel"
## [34] "stats4" "stats" "graphics"
## [37] "grDevices" "utils" "datasets"
## [40] "methods" "base" "tximport"
## [43] "limma" "DESeq2" "SummarizedExperiment"
## [46] "DelayedArray" "BiocParallel" "matrixStats"
## [49] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [52] "IRanges" "S4Vectors" "BiocGenerics"
## [55] "parallel" "stats4" "stats"
## [58] "graphics" "grDevices" "utils"
## [61] "datasets" "methods" "base"
## [64] "gridExtra" "tximport" "limma"
## [67] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [70] "BiocParallel" "matrixStats" "Biobase"
## [73] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [76] "S4Vectors" "BiocGenerics" "parallel"
## [79] "stats4" "stats" "graphics"
## [82] "grDevices" "utils" "datasets"
## [85] "methods" "base" "ensembldb"
## [88] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [91] "gridExtra" "tximport" "limma"
## [94] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [97] "BiocParallel" "matrixStats" "Biobase"
## [100] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [103] "S4Vectors" "BiocGenerics" "parallel"
## [106] "stats4" "stats" "graphics"
## [109] "grDevices" "utils" "datasets"
## [112] "methods" "base" "EnsDb.Mmusculus.v79"
## [115] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [118] "AnnotationDbi" "gridExtra" "tximport"
## [121] "limma" "DESeq2" "SummarizedExperiment"
## [124] "DelayedArray" "BiocParallel" "matrixStats"
## [127] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [130] "IRanges" "S4Vectors" "BiocGenerics"
## [133] "parallel" "stats4" "stats"
## [136] "graphics" "grDevices" "utils"
## [139] "datasets" "methods" "base"
## [142] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [145] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [148] "gridExtra" "tximport" "limma"
## [151] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [154] "BiocParallel" "matrixStats" "Biobase"
## [157] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [160] "S4Vectors" "BiocGenerics" "parallel"
## [163] "stats4" "stats" "graphics"
## [166] "grDevices" "utils" "datasets"
## [169] "methods" "base" "ggplot2"
## [172] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [175] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [178] "gridExtra" "tximport" "limma"
## [181] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [184] "BiocParallel" "matrixStats" "Biobase"
## [187] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [190] "S4Vectors" "BiocGenerics" "parallel"
## [193] "stats4" "stats" "graphics"
## [196] "grDevices" "utils" "datasets"
## [199] "methods" "base" "lattice"
## [202] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [205] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [208] "AnnotationDbi" "gridExtra" "tximport"
## [211] "limma" "DESeq2" "SummarizedExperiment"
## [214] "DelayedArray" "BiocParallel" "matrixStats"
## [217] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [220] "IRanges" "S4Vectors" "BiocGenerics"
## [223] "parallel" "stats4" "stats"
## [226] "graphics" "grDevices" "utils"
## [229] "datasets" "methods" "base"
## [232] "reshape" "lattice" "ggplot2"
## [235] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [238] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [241] "gridExtra" "tximport" "limma"
## [244] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [247] "BiocParallel" "matrixStats" "Biobase"
## [250] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [253] "S4Vectors" "BiocGenerics" "parallel"
## [256] "stats4" "stats" "graphics"
## [259] "grDevices" "utils" "datasets"
## [262] "methods" "base" "mixOmics"
## [265] "MASS" "reshape" "lattice"
## [268] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [271] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [274] "AnnotationDbi" "gridExtra" "tximport"
## [277] "limma" "DESeq2" "SummarizedExperiment"
## [280] "DelayedArray" "BiocParallel" "matrixStats"
## [283] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [286] "IRanges" "S4Vectors" "BiocGenerics"
## [289] "parallel" "stats4" "stats"
## [292] "graphics" "grDevices" "utils"
## [295] "datasets" "methods" "base"
## [298] "gplots" "mixOmics" "MASS"
## [301] "reshape" "lattice" "ggplot2"
## [304] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [307] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [310] "gridExtra" "tximport" "limma"
## [313] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [316] "BiocParallel" "matrixStats" "Biobase"
## [319] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [322] "S4Vectors" "BiocGenerics" "parallel"
## [325] "stats4" "stats" "graphics"
## [328] "grDevices" "utils" "datasets"
## [331] "methods" "base" "RColorBrewer"
## [334] "gplots" "mixOmics" "MASS"
## [337] "reshape" "lattice" "ggplot2"
## [340] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [343] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [346] "gridExtra" "tximport" "limma"
## [349] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [352] "BiocParallel" "matrixStats" "Biobase"
## [355] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [358] "S4Vectors" "BiocGenerics" "parallel"
## [361] "stats4" "stats" "graphics"
## [364] "grDevices" "utils" "datasets"
## [367] "methods" "base" "readr"
## [370] "RColorBrewer" "gplots" "mixOmics"
## [373] "MASS" "reshape" "lattice"
## [376] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [379] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [382] "AnnotationDbi" "gridExtra" "tximport"
## [385] "limma" "DESeq2" "SummarizedExperiment"
## [388] "DelayedArray" "BiocParallel" "matrixStats"
## [391] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [394] "IRanges" "S4Vectors" "BiocGenerics"
## [397] "parallel" "stats4" "stats"
## [400] "graphics" "grDevices" "utils"
## [403] "datasets" "methods" "base"
## [406] "dplyr" "readr" "RColorBrewer"
## [409] "gplots" "mixOmics" "MASS"
## [412] "reshape" "lattice" "ggplot2"
## [415] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [418] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [421] "gridExtra" "tximport" "limma"
## [424] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [427] "BiocParallel" "matrixStats" "Biobase"
## [430] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [433] "S4Vectors" "BiocGenerics" "parallel"
## [436] "stats4" "stats" "graphics"
## [439] "grDevices" "utils" "datasets"
## [442] "methods" "base" "VennDiagram"
## [445] "futile.logger" "dplyr" "readr"
## [448] "RColorBrewer" "gplots" "mixOmics"
## [451] "MASS" "reshape" "lattice"
## [454] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [457] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [460] "AnnotationDbi" "gridExtra" "tximport"
## [463] "limma" "DESeq2" "SummarizedExperiment"
## [466] "DelayedArray" "BiocParallel" "matrixStats"
## [469] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [472] "IRanges" "S4Vectors" "BiocGenerics"
## [475] "parallel" "stats4" "stats"
## [478] "graphics" "grDevices" "utils"
## [481] "datasets" "methods" "base"
## [484] "clusterProfiler" "VennDiagram" "futile.logger"
## [487] "dplyr" "readr" "RColorBrewer"
## [490] "gplots" "mixOmics" "MASS"
## [493] "reshape" "lattice" "ggplot2"
## [496] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [499] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [502] "gridExtra" "tximport" "limma"
## [505] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [508] "BiocParallel" "matrixStats" "Biobase"
## [511] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [514] "S4Vectors" "BiocGenerics" "parallel"
## [517] "stats4" "stats" "graphics"
## [520] "grDevices" "utils" "datasets"
## [523] "methods" "base" "DOSE"
## [526] "clusterProfiler" "VennDiagram" "futile.logger"
## [529] "dplyr" "readr" "RColorBrewer"
## [532] "gplots" "mixOmics" "MASS"
## [535] "reshape" "lattice" "ggplot2"
## [538] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [541] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [544] "gridExtra" "tximport" "limma"
## [547] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [550] "BiocParallel" "matrixStats" "Biobase"
## [553] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [556] "S4Vectors" "BiocGenerics" "parallel"
## [559] "stats4" "stats" "graphics"
## [562] "grDevices" "utils" "datasets"
## [565] "methods" "base" "org.Mm.eg.db"
## [568] "DOSE" "clusterProfiler" "VennDiagram"
## [571] "futile.logger" "dplyr" "readr"
## [574] "RColorBrewer" "gplots" "mixOmics"
## [577] "MASS" "reshape" "lattice"
## [580] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [583] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [586] "AnnotationDbi" "gridExtra" "tximport"
## [589] "limma" "DESeq2" "SummarizedExperiment"
## [592] "DelayedArray" "BiocParallel" "matrixStats"
## [595] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [598] "IRanges" "S4Vectors" "BiocGenerics"
## [601] "parallel" "stats4" "stats"
## [604] "graphics" "grDevices" "utils"
## [607] "datasets" "methods" "base"
## [610] "pathview" "org.Hs.eg.db" "org.Mm.eg.db"
## [613] "DOSE" "clusterProfiler" "VennDiagram"
## [616] "futile.logger" "dplyr" "readr"
## [619] "RColorBrewer" "gplots" "mixOmics"
## [622] "MASS" "reshape" "lattice"
## [625] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [628] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [631] "AnnotationDbi" "gridExtra" "tximport"
## [634] "limma" "DESeq2" "SummarizedExperiment"
## [637] "DelayedArray" "BiocParallel" "matrixStats"
## [640] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [643] "IRanges" "S4Vectors" "BiocGenerics"
## [646] "parallel" "stats4" "stats"
## [649] "graphics" "grDevices" "utils"
## [652] "datasets" "methods" "base"
## [655] "pathview" "org.Hs.eg.db" "org.Mm.eg.db"
## [658] "DOSE" "clusterProfiler" "VennDiagram"
## [661] "futile.logger" "dplyr" "readr"
## [664] "RColorBrewer" "gplots" "mixOmics"
## [667] "MASS" "reshape" "lattice"
## [670] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [673] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [676] "AnnotationDbi" "gridExtra" "tximport"
## [679] "limma" "DESeq2" "SummarizedExperiment"
## [682] "DelayedArray" "BiocParallel" "matrixStats"
## [685] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [688] "IRanges" "S4Vectors" "BiocGenerics"
## [691] "parallel" "stats4" "stats"
## [694] "graphics" "grDevices" "utils"
## [697] "datasets" "methods" "base"
## [700] "knitr" "pathview" "org.Hs.eg.db"
## [703] "org.Mm.eg.db" "DOSE" "clusterProfiler"
## [706] "VennDiagram" "futile.logger" "dplyr"
## [709] "readr" "RColorBrewer" "gplots"
## [712] "mixOmics" "MASS" "reshape"
## [715] "lattice" "ggplot2" "grid"
## [718] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [721] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [724] "tximport" "limma" "DESeq2"
## [727] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [730] "matrixStats" "Biobase" "GenomicRanges"
## [733] "GenomeInfoDb" "IRanges" "S4Vectors"
## [736] "BiocGenerics" "parallel" "stats4"
## [739] "stats" "graphics" "grDevices"
## [742] "utils" "datasets" "methods"
## [745] "base"
Set working directory to the folder that contains only gene count txt files
# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$gene_id))
# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Gene Counts/TNF")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
## [1] "I_MKI1_3351_LIB038588_GEN00143067_R1.sf"
## [2] "I_MKI2_3638_LIB041682_GEN00156310_R1.sf"
## [3] "I_MKI3_3623_LIB041682_GEN00156311_R1.sf"
## [4] "I_MKI4_3838_LIB041682_GEN00156313_R1.sf"
## [5] "I_MKI5_3839_LIB041682_GEN00156314_R1.sf"
## [6] "I_MKI6_3841_LIB041682_GEN00156315_R1.sf"
## [7] "I_V1_3373_LIB038588_GEN00143070_R1.sf"
## [8] "I_V2_3378_LIB038588_GEN00143074_R1.sf"
## [9] "I_V3_3994_LIB038588_GEN00143084_R1.sf"
## [10] "I_V4_3996_LIB038588_GEN00143085_R1.sf"
## [11] "I_V5_3624_LIB041682_GEN00156312_R1.sf"
## [12] "NI_MKI1_3352_LIB038588_GEN00143068_R1.sf"
## [13] "NI_MKI2_3906_LIB038588_GEN00143076_R1.sf"
## [14] "NI_MKI3_3925_LIB038588_GEN00143080_R1.sf"
## [15] "NI_MKI4_3946_LIB038588_GEN00143082_R1.sf"
## [16] "NI_MKI5_3374_LIB038588_GEN00143071_R1.sf"
## [17] "NI_V1_3376_LIB038588_GEN00143072_R1.sf"
## [18] "NI_V2_3377_LIB038588_GEN00143073_R1.sf"
## [19] "NI_V3_3904_LIB038588_GEN00143075_R1.sf"
## [20] "NI_V4_3922_LIB038588_GEN00143078_R1.sf"
## [21] "NI_V5_3924_LIB038588_GEN00143079_R1.sf"
names(countFiles) <- c('I_MKI_1','I_MKI_2','I_MKI_3','I_MKI_4','I_MKI_5','I_MKI_6', 'I_V_1', 'I_V_2', 'I_V_3', 'I_V_4', 'I_V_5', 'NI_MKI_1','NI_MKI_2','NI_MKI_3','NI_MKI_4', 'NI_MKI_5', 'NI_V_1', 'NI_V_2', 'NI_V_3', 'NI_V_4', 'NI_V_5')
txi.salmon <- tximport(countFiles, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all <- data.frame(condition = c(rep('experimental', 11), rep('control', 10)))
DESeqsampletable_all$batch <- factor(c('2','3','3','3','3','3','2','2','2','2','3','2','2','2','2','2','2','2','2','2','2'))
DESeqsampletable_all$gender <- factor(c('M', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M'))
rownames(DESeqsampletable_all) <- colnames(txi.salmon$counts)
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable_all, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- estimateSizeFactors(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
ddsHTSeq_norm <- DESeq(ddsHTSeq_norm)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 73 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
Data is transformed and pseudocount is calculated.
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
ggplot(pseudoCount, aes(x= I_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_1"),
ggplot(pseudoCount, aes(x= I_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_2"),
ggplot(pseudoCount, aes(x= I_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_3"),
ggplot(pseudoCount, aes(x= I_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_4"),
ggplot(pseudoCount, aes(x= I_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_5"),
ggplot(pseudoCount, aes(x= I_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_6"), nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= I_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_1"),
ggplot(pseudoCount, aes(x= I_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_2"),
ggplot(pseudoCount, aes(x= I_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_3"),
ggplot(pseudoCount, aes(x= I_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_4"),
ggplot(pseudoCount, aes(x= I_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_5"), nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= NI_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_1"),
ggplot(pseudoCount, aes(x= NI_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_2"),
ggplot(pseudoCount, aes(x= NI_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_3"),
ggplot(pseudoCount, aes(x= NI_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_4"),
ggplot(pseudoCount, aes(x= NI_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_5"), nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= NI_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_1"),
ggplot(pseudoCount, aes(x= NI_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_2"),
ggplot(pseudoCount, aes(x= NI_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_3"),
ggplot(pseudoCount, aes(x= NI_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_4"),
ggplot(pseudoCount, aes(x= NI_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_5"), nrow = 2)
Check on the gene count distribution across all genes.
#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,4))
ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3", "#E69F00", "#FF0000")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab(expression(log[2](count+1)))
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:6] != 0) >=2 | sum(rawCountTable[i,7:11] != 0) >= 2 | sum(rawCountTable[i,12:16] != 0) >= 2 | sum(rawCountTable[i,17:21] != 0) >= 2) {
keep <- c(keep, i)
}
}
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
rawCountTable_transform_detected_TNF <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected_TNF, "VST_TNF.csv")
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2) +
xlim(-30, 35) + ylim(-20, 20)
Set working directory to the folder that contains only gene count txt files
# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[17:21], countFiles[7:11])
txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(17:21, 7:11), ]
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_I_V vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering_I_V vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering_I_V vs NI_V.png')
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))
pdf('PCA_TNF_IV-vs-NIV.pdf',
height = 6,
width = 8)
PCA
dev.off()
## quartz_off_screen
## 2
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:10] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected_iv_vs_niv <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected_iv_vs_niv, "VST_I_V vs NI_V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_iv_vs_niv)))
}
sig_count <- rawCountTable_transform_detected_iv_vs_niv[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:16] <- zscore(as.numeric(sig_dif[i,7:16]))
}
my_palette <- colorRampPalette(c("blue", "white", "red"))(128)
heatmap_matrix <- as.matrix(sig_dif[,7:16])
png('I_V vs NI_V RNASeq.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "I_V vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('I_V vs NI_V RNASeq.pdf',
width = 6,
height = 10)
heatmap.2(heatmap_matrix,
main = "I_V vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('I_V vs NI_V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 1856
write.csv(sig_dif, "Significant genes_I_V vs NI_V.csv")
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:10])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Scatter Plot"))
pdf('ScatterPlot_I_V vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen
## 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot"))
pdf('MAPlot_I_V vs NI_V.pdf')
MA
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot"))
## Warning: Removed 42 rows containing missing values (geom_point).
pdf('VolcanoPlot_I_V vs NI_V.pdf')
volcano
## Warning: Removed 42 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Classic GO analysis is performed here for all DE genes detected in this dataset. The reference list is list of genes detected in RNASeq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.
target_gene <- as.character(rownames(sig_dif))
detected_gene <- as.character(rownames(detected_pseudocount))
# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
write.csv(cluster_summary_BP, "GO analysis/GO analysis_BP_I_V vs NI_V.csv")
# Run GO enrichment analysis for molecular function
ego_MF <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
write.csv(cluster_summary_MF, "GO analysis/GO analysis_MF_I_V vs NI_V.csv")
# Run GO enrichment analysis for cellular component
ego_CC <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
write.csv(cluster_summary_CC, "GO analysis/GO analysis_CC_I_V vs NI_V.csv")
png('GO analysis/GO dotplot_BP_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following:
png('GO analysis/GO dotplot_MF_I_V vs NI_V.png',
width = 1200,
height = 1600,
res = 100,
pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following:
png('GO analysis/GO dotplot_CC_I_V vs NI_V.png',
width = 1200,
height = 1600,
res = 100,
pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Draw enrichment Go plot representing the results ##### Biological process
png('GO analysis/GO enrichment_BP_I_V vs NI_V.png',
width = 1200,
height = 1200,
res = 100,
pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: ##### Molecular function
png('GO analysis/GO enrichment_MF_I_V vs NI_V.png',
width = 1200,
height = 1200,
res = 100,
pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: ##### Cellular component
png('GO analysis/GO enrichment_CC_I_V vs NI_V.png',
width = 1200,
height = 1200,
res = 100,
pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Draw category netplot representing the results The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). ##### Biological process
OE_foldchanges <- sig_dif$log2FoldChange
names(OE_foldchanges) <- rownames(sig_dif)
png('GO analysis/GO cnetplot_BP_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following: ##### Molecular function
png('GO analysis/GO cnetplot_MF_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following: ##### Cellular component
png('GO analysis/GO cnetplot_CC_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following:
Set working directory to the folder that contains only gene count txt files
# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:11], countFiles[1:6])
txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:11, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_I_MKI vs I_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering_I_MKI vs I_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering_I_MKI vs I_V.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))
pdf('PCA_TNF_IMKi-vs-IV.pdf',
height = 6,
width = 8)
PCA
dev.off()
## quartz_off_screen
## 2
This step removes all genes with less than 50 counts across all control or experimental samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.1 (I decided to relax the padj cutoff here as well to keep it consistent with the TCT model analysis) were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_MKI vs I_V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:17])
png('I_MKI vs I_V RNASeq.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('I_MKI vs I_V RNASeq.pdf',
width = 6,
height = 10)
heatmap.2(heatmap_matrix,
main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('I_MKI vs I_V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 18
write.csv(sig_dif, "Significant genes_I_MKI vs I_V.csv")
# rearrange the TCT master count matrix
rawCountTable_transform_detected_TNF <- read_csv("VST_TNF.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TNF$X1
rawCountTable_transform_detected_TNF$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TNF[,17:21], rawCountTable_transform_detected_TNF[,7:11], rawCountTable_transform_detected_TNF[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names
# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:22] <- zscore(as.numeric(sig_dif[i,7:22]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:22])
png('TNF MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png',
width = 600,
height = 700,
res = 100,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "TNF MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('TNF MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.pdf',
width = 9,
height = 11)
heatmap.2(heatmap_matrix,
main = "TNF MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('TNF MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png')
# rearrange the TCT master count matrix
rawCountTable_transform_detected_TCT <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT/VST_TCT.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TCT$X1
rawCountTable_transform_detected_TCT$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TCT[,19:23], rawCountTable_transform_detected_TCT[,7:12], rawCountTable_transform_detected_TCT[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names
# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:23] <- zscore(as.numeric(sig_dif[i,7:23]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:23])
png('TNF MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png',
width = 600,
height = 700,
res = 100,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "TNF MK2i targets\nTCT I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('TNF MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.pdf',
width = 9,
height = 11)
heatmap.2(heatmap_matrix,
main = "TNF MK2i targets\nTCT NI_V vs I_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('TNF MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png')
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Scatter Plot"))
pdf('ScatterPlot_I_MKi vs I_V.pdf')
scatter
dev.off()
## quartz_off_screen
## 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot"))
pdf('MAPlot_I_MKi vs I_V.pdf')
MA
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot"))
## Warning: Removed 29 rows containing missing values (geom_point).
pdf('VolcanoPlot_I_MKi vs I_V.pdf')
volcano
## Warning: Removed 29 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Set working directory to the folder that contains only gene count txt files
# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[17:21], countFiles[12:16])
txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(17:21, 12:16), ]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 5)))
DESeqsampletable$batch <- NULL
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_NI_MKI vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering_NI_MKI vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering_NI_MKI vs NI_V.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))
pdf('PCA_TNF_NIMKi-vs-NIV.pdf',
width = 8,
height = 6)
PCA
dev.off()
## quartz_off_screen
## 2
This step removes all genes with less than 50 counts in average across all control or experimental samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:10] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_NI_MKI vs NI_V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_NI_MKI vs NI_V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.1 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_NI_MKI vs NI_V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:16] <- zscore(as.numeric(sig_dif[i,7:16]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:16])
png('NI_MKI vs NI_V RNASeq.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('NI_MKI vs NI_V RNASeq.pdf',
width = 6,
height = 10)
heatmap.2(heatmap_matrix,
main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('NI_MKI vs NI_V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 20
write.csv(sig_dif, "Significant genes_NI_MKI vs NI_V.csv")
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$NI_MKI_mean <- rowMeans(detected_pseudocount[,6:10])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = NI_MKI_mean)) +
xlab("NI_V_Average(log2)") + ylab("NI_MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Scatter Plot"))
pdf('ScatterPlot_NI_MKi vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen
## 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V MA Plot"))
pdf('MAPlot_NI_MKi vs NI_V.pdf')
MA
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Volcano Plot"))
## Warning: Removed 53 rows containing missing values (geom_point).
pdf('VolcanoPlot_NI_MKi vs NI_V.pdf')
volcano
## Warning: Removed 53 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Set working directory to the folder that contains only gene count txt files
# Generate DESeqData using the counting result generated using Salmon
countFiles_all <- c(countFiles[17:21], countFiles[7:11], countFiles[12:16], countFiles[1:6])
txi.salmon <- tximport(countFiles_all, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(17:21, 7:11, 12:16, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 10), rep('experimental', 11)))
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_MKI vs V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)
#geom_text(aes(label=name), vjust = 2) +
#xlim(-30, 20) + ylim(-25, 15)
This step removes all genes with less than 50 counts across all control or experimental samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:10] != 0) >=2 | sum(rawCountTable[i,11:21] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_MKI vs V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_MKI vs V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.1 (I decided to relax the padj cutoff here as well to keep it consistent with the TCT model analysis) were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_MKI vs V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:27] <- zscore(as.numeric(sig_dif[i,7:27]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:27])
png('MKI vs V RNASeq.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "MKI vs V RNASeq\npadj < 0.1\n LFC > 0.1",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "both",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 11
write.csv(sig_dif, "Significant genes_MKI vs V.csv")
Final output is
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$MKI_mean <- rowMeans(detected_pseudocount[,11:21])
dif_analysis$V_mean <- rowMeans(detected_pseudocount[,1:10])
ggplot(dif_analysis, aes(x = V_mean, y = MKI_mean)) +
xlab("V_Average(log2)") + ylab("MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V Scatter Plot")
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V MA Plot")
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V Volcano Plot")
## Warning: Removed 39 rows containing missing values (geom_point).
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] mosaic_1.6.0 Matrix_1.2-18
## [3] mosaicData_0.17.0 ggformula_0.9.4
## [5] ggstance_0.3.4 knitr_1.28
## [7] pathview_1.26.0 org.Hs.eg.db_3.10.0
## [9] org.Mm.eg.db_3.10.0 DOSE_3.12.0
## [11] clusterProfiler_3.14.3 VennDiagram_1.6.20
## [13] futile.logger_1.4.3 dplyr_0.8.5
## [15] readr_1.3.1 RColorBrewer_1.1-2
## [17] gplots_3.0.3 mixOmics_6.10.9
## [19] MASS_7.3-51.6 reshape_0.8.8
## [21] lattice_0.20-41 ggplot2_3.3.0
## [23] EnsDb.Mmusculus.v79_2.99.0 ensembldb_2.10.2
## [25] AnnotationFilter_1.10.0 GenomicFeatures_1.38.2
## [27] AnnotationDbi_1.48.0 gridExtra_2.3
## [29] tximport_1.14.2 limma_3.42.2
## [31] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [33] DelayedArray_0.12.3 BiocParallel_1.20.1
## [35] matrixStats_0.56.0 Biobase_2.46.0
## [37] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [39] IRanges_2.20.2 S4Vectors_0.24.4
## [41] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.0.0 RSQLite_2.2.0 htmlwidgets_1.5.1
## [4] munsell_0.5.0 withr_2.2.0 colorspace_1.4-1
## [7] GOSemSim_2.12.1 rstudioapi_0.11 labeling_0.3
## [10] KEGGgraph_1.46.0 urltools_1.7.3 GenomeInfoDbData_1.2.2
## [13] polyclip_1.10-0 bit64_0.9-7 farver_2.0.3
## [16] generics_0.0.2 vctrs_0.2.4 lambda.r_1.2.4
## [19] xfun_0.13 BiocFileCache_1.10.2 R6_2.4.1
## [22] graphlayouts_0.7.0 locfit_1.5-9.4 bitops_1.0-6
## [25] fgsea_1.12.0 gridGraphics_0.5-0 assertthat_0.2.1
## [28] scales_1.1.0 ggraph_2.0.2 nnet_7.3-14
## [31] enrichplot_1.6.1 gtable_0.3.0 tidygraph_1.1.2
## [34] rlang_0.4.5 genefilter_1.68.0 splines_3.6.3
## [37] rtracklayer_1.46.0 lazyeval_0.2.2 acepack_1.4.1
## [40] broom_0.5.6 mosaicCore_0.6.0 europepmc_0.3
## [43] checkmate_2.0.0 BiocManager_1.30.10 yaml_2.2.1
## [46] reshape2_1.4.4 crosstalk_1.1.0.1 backports_1.1.6
## [49] qvalue_2.18.0 Hmisc_4.4-0 tools_3.6.3
## [52] ggplotify_0.0.5 ellipsis_0.3.0 ggdendro_0.1-20
## [55] ggridges_0.5.2 Rcpp_1.0.4 plyr_1.8.6
## [58] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.32.0
## [61] purrr_0.3.4 RCurl_1.98-1.2 prettyunits_1.1.1
## [64] rpart_4.1-15 openssl_1.4.1 viridis_0.5.1
## [67] cowplot_1.0.0 ggrepel_0.8.2 cluster_2.1.0
## [70] magrittr_1.5 data.table_1.12.8 RSpectra_0.16-0
## [73] futile.options_1.0.1 DO.db_2.9 triebeard_0.3.0
## [76] ProtGenerics_1.18.0 hms_0.5.3 evaluate_0.14
## [79] xtable_1.8-4 leaflet_2.0.3 XML_3.99-0.3
## [82] jpeg_0.1-8.1 compiler_3.6.3 biomaRt_2.42.1
## [85] ellipse_0.4.1 tibble_3.0.1 KernSmooth_2.23-17
## [88] crayon_1.3.4 htmltools_0.4.0 corpcor_1.6.9
## [91] Formula_1.2-3 tidyr_1.0.2 geneplotter_1.64.0
## [94] DBI_1.1.0 tweenr_1.0.1 formatR_1.7
## [97] dbplyr_1.4.3 rappdirs_0.3.1 gdata_2.18.0
## [100] igraph_1.2.5 pkgconfig_2.0.3 rvcheck_0.1.8
## [103] GenomicAlignments_1.22.1 foreign_0.8-76 xml2_1.3.2
## [106] rARPACK_0.11-0 annotate_1.64.0 XVector_0.26.0
## [109] stringr_1.4.0 digest_0.6.25 graph_1.64.0
## [112] Biostrings_2.54.0 rmarkdown_2.1 fastmatch_1.1-0
## [115] htmlTable_1.13.3 curl_4.3 Rsamtools_2.2.3
## [118] gtools_3.8.2 nlme_3.1-147 lifecycle_0.2.0
## [121] jsonlite_1.6.1 viridisLite_0.3.0 askpass_1.1
## [124] pillar_1.4.3 KEGGREST_1.26.1 httr_1.4.1
## [127] survival_3.1-12 GO.db_3.10.0 glue_1.4.0
## [130] png_0.1-7 bit_1.1-15.2 Rgraphviz_2.30.0
## [133] ggforce_0.3.1 stringi_1.4.6 blob_1.2.1
## [136] latticeExtra_0.6-29 caTools_1.18.0 memoise_1.1.0